More ggplot2

MACS 40700 University of Chicago

Group data and the group aesthetic

library(gapminder)

ggplot(data = gapminder,
       mapping = aes(x = year,
                     y = lifeExp)) +
  geom_line()

Group data and the group aesthetic

ggplot(data = gapminder,
       mapping = aes(x = year,
                     y = lifeExp,
                     group = country)) +
  geom_line()

Group data and the group aesthetic

ggplot(data = gapminder,
       mapping = aes(x = lifeExp,
                     y = gdpPercap,
                     color = continent)) +
  geom_point()

ggplot(data = gapminder,
       mapping = aes(x = lifeExp,
                     y = gdpPercap,
                     color = continent,
                     # redundant
                     group = continent)) +
  geom_point()

Statistical transformations and temporary variables

ggplot(data = gapminder,
       mapping = aes(x = year,
                     y = lifeExp)) +
  geom_line(aes(group = country), alpha = .1) +
  geom_smooth()

Statistical transformations and temporary variables

ggplot(data = gapminder,
       mapping = aes(x = year,
                     y = lifeExp)) +
  geom_line(aes(group = country), alpha = .1) +
  geom_smooth(method = "lm")

Statistical transformations and temporary variables

ggplot(data = gss_sm,
            mapping = aes(x = bigregion)) +
  geom_bar()

Statistical transformations and temporary variables

ggplot(data = gss_sm,
            mapping = aes(x = bigregion)) +
  geom_bar(aes(y = ..prop..))

Statistical transformations and temporary variables

ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          group = 1)) +
  geom_bar(aes(y = ..prop..))

Frequency plots

ggplot(data = gss_sm,
       mapping = aes(x = religion,
                     fill = religion)) +
  geom_bar()

ggplot(data = gss_sm,
       mapping = aes(x = religion, fill = religion)) +
  geom_bar() +
  guides(fill = FALSE) 

The awkward way

ggplot(data = gss_sm,
       mapping = aes(x = bigregion,
                     fill = religion)) +
  geom_bar()

The awkward way

ggplot(data = gss_sm,
       mapping = aes(x = bigregion,
                     fill = religion)) +
  geom_bar(position = "fill")

The awkward way

ggplot(data = gss_sm,
       mapping = aes(x = bigregion,
                     fill = religion)) +
  geom_bar(position = "dodge")

The awkward way

ggplot(data = gss_sm,
       mapping = aes(x = bigregion,
                     fill = religion)) +
  geom_bar(aes(y = ..prop..), position = "dodge")

The awkward way

ggplot(data = gss_sm,
       mapping = aes(x = bigregion,
                     fill = religion)) +
  geom_bar(aes(y = ..prop..,
               group = religion), position = "dodge")

The awkward way

ggplot(data = gss_sm,
       mapping = aes(x = religion)) +
  geom_bar(aes(y = ..prop..,
               group = bigregion), position = "dodge") +
    facet_wrap(~ bigregion)

Calculate manually

glimpse(gss_sm)
## Observations: 2,867
## Variables: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20...
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ ballot      <dbl+lbl> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3...
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32...
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6,...
## $ sibs        <dbl+lbl> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1...
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Grad...
## $ race        <fct> White, White, White, White, White, White, White, O...
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Fe...
## $ region      <fct> New England, New England, New England, New England...
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999...
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ marital     <fct> Married, Never Married, Married, Married, Married,...
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelo...
## $ madeg       <fct> High School, High School, Lt High School, High Sch...
## $ partyid     <fct> Independent, Ind,near Dem, Not Str Republican, Not...
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightl...
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Hap...
## $ partners    <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner...
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Le...
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpi...
## $ pres12      <dbl+lbl> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1,...
## $ wtssall     <dbl> 0.957, 0.478, 0.957, 1.914, 1.435, 0.957, 1.435, 0...
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $...
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-5...
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-6...
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, ...
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+,...
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northe...
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0,...
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, ...
(rel_by_region <- gss_sm %>%
    group_by(bigregion, religion) %>%
    summarize(N = n()) %>%
    mutate(freq = N / sum(N),
           pct = round((freq * 100), 0)))
## # A tibble: 24 x 5
## # Groups:   bigregion [4]
##    bigregion religion       N    freq   pct
##    <fct>     <fct>      <int>   <dbl> <dbl>
##  1 Northeast Protestant   158 0.324     32.
##  2 Northeast Catholic     162 0.332     33.
##  3 Northeast Jewish        27 0.0553     6.
##  4 Northeast None         112 0.230     23.
##  5 Northeast Other         28 0.0574     6.
##  6 Northeast <NA>           1 0.00205    0.
##  7 Midwest   Protestant   325 0.468     47.
##  8 Midwest   Catholic     172 0.247     25.
##  9 Midwest   Jewish         3 0.00432    0.
## 10 Midwest   None         157 0.226     23.
## # ... with 14 more rows

Calculate manually

ggplot(data = rel_by_region,
       mapping = aes(x = bigregion,
                     y = pct,
                     fill = religion)) +
  geom_col(position = "dodge2") +
  labs(x = "Region", y = "Percent", fill = "Religion") +
  theme(legend.position = "top")

Calculate manually

ggplot(data = rel_by_region,
       mapping = aes(x = religion,
                     y = pct,
                     fill = religion)) +
  geom_col(position = "dodge2") +
  labs(x = "Region", y = "Percent", fill = "Religion") +
  guides(fill = FALSE) + 
  coord_flip() + 
  facet_grid(~ bigregion)

Continuous variables by group or category

glimpse(organdata)
## Observations: 238
## Variables: 21
## $ country          <chr> "Australia", "Australia", "Australia", "Austr...
## $ year             <date> NA, 1991-01-01, 1992-01-01, 1993-01-01, 1994...
## $ donors           <dbl> NA, 12.09, 12.35, 12.51, 10.25, 10.18, 10.59,...
## $ pop              <int> 17065, 17284, 17495, 17667, 17855, 18072, 183...
## $ pop_dens         <dbl> 0.220, 0.223, 0.226, 0.228, 0.231, 0.233, 0.2...
## $ gdp              <int> 16774, 17171, 17914, 18883, 19849, 21079, 219...
## $ gdp_lag          <int> 16591, 16774, 17171, 17914, 18883, 19849, 210...
## $ health           <dbl> 1300, 1379, 1455, 1540, 1626, 1737, 1846, 194...
## $ health_lag       <dbl> 1224, 1300, 1379, 1455, 1540, 1626, 1737, 184...
## $ pubhealth        <dbl> 4.8, 5.4, 5.4, 5.4, 5.4, 5.5, 5.6, 5.7, 5.9, ...
## $ roads            <dbl> 136.6, 122.3, 112.8, 110.5, 108.0, 111.6, 107...
## $ cerebvas         <int> 682, 647, 630, 611, 631, 592, 576, 525, 516, ...
## $ assault          <int> 21, 19, 17, 18, 17, 16, 17, 17, 16, 15, 16, 1...
## $ external         <int> 444, 425, 406, 376, 387, 371, 395, 385, 410, ...
## $ txp_pop          <dbl> 0.938, 0.926, 0.915, 0.906, 0.896, 0.885, 0.8...
## $ world            <chr> "Liberal", "Liberal", "Liberal", "Liberal", "...
## $ opt              <chr> "In", "In", "In", "In", "In", "In", "In", "In...
## $ consent_law      <chr> "Informed", "Informed", "Informed", "Informed...
## $ consent_practice <chr> "Informed", "Informed", "Informed", "Informed...
## $ consistent       <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye...
## $ ccode            <chr> "Oz", "Oz", "Oz", "Oz", "Oz", "Oz", "Oz", "Oz...

Scatterplot

ggplot(data = organdata,
            mapping = aes(x = year, y = donors)) +
  geom_point()

Line plots

ggplot(data = organdata,
            mapping = aes(x = year, y = donors)) +
  geom_line(aes(group = country)) +
  facet_wrap(~ country)

Boxplots

ggplot(data = organdata,
            mapping = aes(x = country, y = donors)) +
  geom_boxplot()

coord_flip()

ggplot(data = organdata,
            mapping = aes(x = country, y = donors)) +
  geom_boxplot() +
  coord_flip()

reorder()

ggplot(data = organdata,
       mapping = aes(x = reorder(country, donors, na.rm = TRUE),
                     y = donors)) +
  geom_boxplot() +
  labs(x = NULL) +
  coord_flip()

Add color

ggplot(data = organdata,
       mapping = aes(x = reorder(country, donors, na.rm = TRUE),
                     y = donors, fill = world)) +
  geom_boxplot() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "bottom")

Strip chart

ggplot(data = organdata,
       mapping = aes(x = reorder(country, donors, na.rm = TRUE),
                     y = donors, color = world)) +
  geom_point() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "bottom")

Strip chart

ggplot(data = organdata,
       mapping = aes(x = reorder(country, donors, na.rm = TRUE),
                     y = donors, color = world)) +
  geom_jitter() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "bottom")

Calculate summary statistics

(by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize(donors_mean = mean(donors, na.rm = TRUE),
            donors_sd = sd(donors, na.rm = TRUE),
            gdp_mean = mean(gdp, na.rm = TRUE),
            health_mean = mean(health, na.rm = TRUE),
            roads_mean = mean(roads, na.rm = TRUE),
            cerebvas_mean = mean(cerebvas, na.rm = TRUE)))
## # A tibble: 17 x 8
## # Groups:   consent_law [?]
##    consent_law country        donors_mean donors_sd gdp_mean health_mean
##    <chr>       <chr>                <dbl>     <dbl>    <dbl>       <dbl>
##  1 Informed    Australia             10.6     1.14    22179.       1958.
##  2 Informed    Canada                14.0     0.751   23711.       2272.
##  3 Informed    Denmark               13.1     1.47    23722.       2054.
##  4 Informed    Germany               13.0     0.611   22163.       2349.
##  5 Informed    Ireland               19.8     2.48    20824.       1480.
##  6 Informed    Netherlands           13.7     1.55    23013.       1993.
##  7 Informed    United Kingdom        13.5     0.775   21359.       1561.
##  8 Informed    United States         20.0     1.33    29212.       3988.
##  9 Presumed    Austria               23.5     2.42    23876.       1875.
## 10 Presumed    Belgium               21.9     1.94    22500.       1958.
## 11 Presumed    Finland               18.4     1.53    21019.       1615.
## 12 Presumed    France                16.8     1.60    22603.       2160.
## 13 Presumed    Italy                 11.1     4.28    21554.       1757.
## 14 Presumed    Norway                15.4     1.11    26448.       2217.
## 15 Presumed    Spain                 28.1     4.96    16933.       1289.
## 16 Presumed    Sweden                13.1     1.75    22415.       1951.
## 17 Presumed    Switzerland           14.2     1.71    27233.       2776.
## # ... with 2 more variables: roads_mean <dbl>, cerebvas_mean <dbl>

Calculate summary statistics

(by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize_if(is.numeric, funs(mean, sd), na.rm = TRUE) %>%
  ungroup())
## # A tibble: 17 x 28
##    consent_law country        donors_mean pop_mean pop_dens_mean gdp_mean
##    <chr>       <chr>                <dbl>    <dbl>         <dbl>    <dbl>
##  1 Informed    Australia             10.6   18318.         0.237   22179.
##  2 Informed    Canada                14.0   29608.         0.297   23711.
##  3 Informed    Denmark               13.1    5257.        12.2     23722.
##  4 Informed    Germany               13.0   80255.        22.5     22163.
##  5 Informed    Ireland               19.8    3674.         5.23    20824.
##  6 Informed    Netherlands           13.7   15548.        37.4     23013.
##  7 Informed    United Kingdom        13.5   58187.        24.0     21359.
##  8 Informed    United States         20.0  269330.         2.80    29212.
##  9 Presumed    Austria               23.5    7927.         9.45    23876.
## 10 Presumed    Belgium               21.9   10153.        30.7     22500.
## 11 Presumed    Finland               18.4    5112.         1.51    21019.
## 12 Presumed    France                16.8   58056.        10.5     22603.
## 13 Presumed    Italy                 11.1   57360.        19.0     21554.
## 14 Presumed    Norway                15.4    4386.         1.35    26448.
## 15 Presumed    Spain                 28.1   39666.         7.84    16933.
## 16 Presumed    Sweden                13.1    8789.         1.95    22415.
## 17 Presumed    Switzerland           14.2    7037.        17.0     27233.
## # ... with 22 more variables: gdp_lag_mean <dbl>, health_mean <dbl>,
## #   health_lag_mean <dbl>, pubhealth_mean <dbl>, roads_mean <dbl>,
## #   cerebvas_mean <dbl>, assault_mean <dbl>, external_mean <dbl>,
## #   txp_pop_mean <dbl>, donors_sd <dbl>, pop_sd <dbl>, pop_dens_sd <dbl>,
## #   gdp_sd <dbl>, gdp_lag_sd <dbl>, health_sd <dbl>, health_lag_sd <dbl>,
## #   pubhealth_sd <dbl>, roads_sd <dbl>, cerebvas_sd <dbl>,
## #   assault_sd <dbl>, external_sd <dbl>, txp_pop_sd <dbl>

Draw the plot

ggplot(data = by_country,
       mapping = aes(x = donors_mean,
                     y = reorder(country, donors_mean),
                     color = consent_law)) +
  geom_point(size = 3) +
  labs(x = "Donor Procurement Rate",
       y = "", color = "Consent Law") +
  theme(legend.position = "top")

Use facet instead of color

ggplot(data = by_country,
       mapping = aes(x = donors_mean,
                     y = reorder(country, donors_mean))) +
  geom_point(size = 3) +
  facet_wrap(~ consent_law, ncol = 1) +
  labs(x = "Donor Procurement Rate",
       y = "", color = "Consent Law")

Use facet instead of color

ggplot(data = by_country,
       mapping = aes(x = donors_mean,
                     y = reorder(country, donors_mean))) +
  geom_point(size = 3) +
  facet_wrap(~ consent_law, scales = "free_y", ncol = 1) +
  labs(x = "Donor Procurement Rate",
       y = "", color = "Consent Law")

Add standard deviation

ggplot(data = by_country,
       mapping = aes(x = reorder(country, donors_mean),
                     y = donors_mean)) +
  geom_pointrange(mapping = aes(ymin = donors_mean - donors_sd,
                                ymax = donors_mean + donors_sd)) +
  labs(x = "",
       y = "Donor Procurement Rate") +
  coord_flip()

geom_text()

ggplot(data = by_country,
       mapping = aes(x = roads_mean,
                     y = donors_mean)) +
  geom_point() +
  geom_text(mapping = aes(label = country))

geom_text()

ggplot(data = by_country,
       mapping = aes(x = roads_mean,
                     y = donors_mean)) +
  geom_point() +
  geom_text(mapping = aes(label = country), hjust = 0)

ggrepel::geom_text_repel()

elections_historic %>%
  select(2:7)
## # A tibble: 49 x 6
##     year winner                win_party ec_pct popular_pct popular_margin
##    <int> <chr>                 <chr>      <dbl>       <dbl>          <dbl>
##  1  1824 John Quincy Adams     D.-R.      0.322       0.309        -0.104 
##  2  1828 Andrew Jackson        Dem.       0.682       0.559         0.122 
##  3  1832 Andrew Jackson        Dem.       0.766       0.547         0.178 
##  4  1836 Martin Van Buren      Dem.       0.578       0.508         0.142 
##  5  1840 William Henry Harris… Whig       0.796       0.529         0.0605
##  6  1844 James Polk            Dem.       0.618       0.495         0.0145
##  7  1848 Zachary Taylor        Whig       0.562       0.473         0.0479
##  8  1852 Franklin Pierce       Dem.       0.858       0.508         0.0695
##  9  1856 James Buchanan        Dem.       0.588       0.453         0.122 
## 10  1860 Abraham Lincoln       Rep.       0.594       0.396         0.101 
## # ... with 39 more rows
p_title <- "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional."
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"

library(ggrepel)

ggplot(elections_historic, aes(x = popular_pct,
                               y = ec_pct,
                               label = winner_label)) +
  geom_hline(yintercept = 0.5, size = 1.4, color = "gray80") +
  geom_vline(xintercept = 0.5, size = 1.4, color = "gray80") +
  geom_point() +
  geom_text_repel() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = x_label, y = y_label, title = p_title, subtitle = p_subtitle,
       caption = p_caption)

Label outliers only

ggplot(data = by_country,
       mapping = aes(x = gdp_mean, y = health_mean)) +
  geom_point() +
  geom_text_repel(data = filter(by_country, gdp_mean > 25000),
                  mapping = aes(label = country))

ggplot(data = by_country,
       mapping = aes(x = gdp_mean, y = health_mean)) +
  geom_point() +
  geom_text_repel(data = filter(by_country,
                                gdp_mean > 25000 | health_mean < 1500 |
                                  country %in% "Belgium"),
                  mapping = aes(label = country))